# ── Setup ──────────────────────────────────────────────────────────────────
# When run interactively in RStudio, move to this script's own directory so
# that the relative '../data/...' paths used below resolve correctly.
# NOTE(review): setwd() in scripts is generally discouraged; kept here because
# the whole analysis relies on paths relative to the script location.
if (rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Pin the dplyr versions of commonly masked verbs so that base/stats
# equivalents never silently win during interactive use.
library(conflicted)
conflict_prefer("filter", "dplyr")## [conflicted] Will prefer dplyr::filter over any other package
conflict_prefer("select", "dplyr")## [conflicted] Will prefer dplyr::select over any other package
conflict_prefer("slice", "dplyr")## [conflicted] Will prefer dplyr::slice over any other package
conflict_prefer("rename", "dplyr")## [conflicted] Will prefer dplyr::rename over any other package
conflict_prefer('intersect', 'dplyr')## [conflicted] Will prefer dplyr::intersect over any other package
# Selected circRNAs with primer design and validation results
# (Supplementary Table 3).
cq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')## Rows: 1560 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (26): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# All circRNAs detected by any tool in any cell line (Supplementary Table 2).
all_circ = read_tsv("../data/Supplementary_Table_2_all_circRNAs.txt")## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-tool precision values (Supplementary Table 6A).
val = read_tsv('../data/Supplementary_Table_6A_precision_values.txt')## Rows: 29 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group
## dbl (16): nr_qPCR_total, nr_qPCR_fail, nr_qPCR_val, nr_RR_total, nr_RR_fail,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# ── Database annotation (n_db) vs validation status ───────────────────────
cq
all_circ

# make count table
# NOTE(review): unlike the later contingency tables, no unique() is applied
# before counting here — confirm this is intended.
cont_table = cq %>%
  select(circ_id, cell_line, compound_val, count_group_median, n_db) %>%
  # only keep high abundant circRNAs
  filter(count_group_median == 'count ≥ 5') %>%
  # filter out circ that are NAs for amplicon sequencing validation
  filter(!is.na(compound_val)) %>%
  # change nr of databases to binary (NA = novel, any value = in a database)
  mutate(n_db = ifelse(is.na(n_db), 0, 1)) %>%
  group_by(n_db) %>%
  count(compound_val) %>%
  pivot_wider(values_from = n, names_from = compound_val) %>%
  ungroup() %>%
  select(-n_db)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("novel", "in_db")
cont_table

# plot data
mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

# if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected
## fail pass
## novel 20.1309 138.8691
## in_db 97.8691 675.1309

# chisquare test
chisq.test(cont_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 180.95, df = 1, p-value < 2.2e-16

# OR: odds ratio of validating when the circRNA is in a database vs novel
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 13.07946
# ── Detection by multiple tools (n_detected) vs validation status ─────────
# make count table
cont_table = cq %>%
  select(circ_id, cell_line, compound_val, count_group_median, n_detected) %>%
  unique() %>%
  filter(count_group_median == 'count ≥ 5') %>%
  # filter out NAs
  filter(!is.na(compound_val)) %>%
  # change nr of tools to binary (1 tool = 0, more than one tool = 1)
  mutate(n_detected = ifelse(n_detected == 1, 0, 1)) %>%
  group_by(n_detected) %>%
  count(compound_val) %>%
  pivot_wider(values_from = n, names_from = compound_val) %>%
  ungroup() %>%
  select(-n_detected)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("unique", "multiple_tools")
cont_table

# plot data
mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

# if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected
## fail pass
## unique 10.62444 70.37556
## multiple_tools 106.37556 704.62444

# chisquare test
chisq.test(cont_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 333.13, df = 1, p-value < 2.2e-16

# OR: odds ratio of validating when detected by multiple tools vs one tool
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 53.8424
# ── Single- vs multi-exon circRNAs vs validation status ───────────────────
# make count table
cont_table = cq %>%
  select(circ_id, cell_line, compound_val, count_group_median, nr_exons) %>%
  unique() %>%
  filter(count_group_median == 'count ≥ 5') %>%
  # filter out NAs
  filter(!is.na(compound_val)) %>%
  # also remove the ones that do not have a exon annotation
  filter(!is.na(nr_exons), !nr_exons == "ambiguous") %>%
  mutate(exon_bin = ifelse(nr_exons == 1, 0, 1)) %>%
  group_by(exon_bin) %>%
  count(compound_val) %>%
  pivot_wider(values_from = n, names_from = compound_val) %>%
  ungroup() %>%
  # a class with zero fails produces NA after pivot_wider; make it an explicit 0
  mutate(fail = ifelse(is.na(fail), 0, fail)) %>%
  select(-exon_bin)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("single_exon", "multi_exons")
cont_table

# plot data
mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

# if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected
## fail pass
## single_exon 9.539216 129.4608
## multi_exons 39.460784 535.5392

# chisquare test
chisq.test(cont_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 19.995, df = 1, p-value = 7.766e-06

# OR: odds ratio of validating for multi-exon vs single-exon circRNAs
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 3.816398
# ── Start exon (exon 1 vs other) vs validation status ─────────────────────
# Extract the exon number from the fixed-position start_match string;
# drop a leading '_' left over when the extracted substring starts one early.
cq_start = cq %>%
  mutate(start_exon_nr = substr(start_match, 19, 27),
         start_exon_nr = ifelse(substr(start_exon_nr, 1, 1) == '_',
                                substr(start_exon_nr, 2, 10),
                                start_exon_nr))
cq_start

cont_table = cq_start %>%
  filter(count_group_median == 'count ≥ 5') %>%
  select(circ_id, cell_line, compound_val, start_exon_nr) %>%
  unique() %>%
  filter(!is.na(start_exon_nr)) %>%
  mutate(exon_1 = ifelse(start_exon_nr == "exon_1", 1, 0)) %>%
  # filter out NAs
  filter(!is.na(compound_val)) %>%
  group_by(exon_1) %>%
  count(compound_val) %>%
  pivot_wider(values_from = n, names_from = compound_val) %>%
  ungroup()
cont_table
# => not enough values in this table to run a meaningful test; no test performed
# ── Canonical splice-site motif (AGNGT) vs validation status ──────────────
# NOTE(review): row labels keep the original spelling "cannonical".
cont_table = cq %>%
  filter(count_group_median == 'count ≥ 5',
         !strand == 'unknown') %>%
  select(circ_id, cell_line, compound_val, ss_motif) %>%
  unique() %>%
  # binary flag: 1 = canonical AGNGT splice-site motif, 0 = anything else
  mutate(ss_can = ifelse(ss_motif == "AGNGT", 1, 0)) %>%
  # filter out NAs
  filter(!is.na(compound_val)) %>%
  group_by(ss_can) %>%
  count(compound_val) %>%
  pivot_wider(values_from = n, names_from = compound_val) %>%
  ungroup() %>%
  select(-ss_can)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("non_cannonical", "cannonical")
cont_table

# plot data
mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

# if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected
## fail pass
## non_cannonical 13.92493 99.07507
## cannonical 73.07507 519.92493

# chisquare test
chisq.test(cont_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 45.392, df = 1, p-value = 1.613e-11

# OR: odds ratio of validating for canonical vs non-canonical motifs
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 4.968678
# ── Match to an annotated transcript (ENST) vs validation status ──────────
cont_table = cq %>%
  filter(count_group_median == "count ≥ 5") %>%
  select(circ_id, cell_line, compound_val, ENST) %>%
  unique() %>%
  # filter out NAs
  filter(!is.na(compound_val)) %>%
  # make ENST match binary
  mutate(ENST_group = ifelse(is.na(ENST), 'no_match', "match")) %>%
  group_by(ENST_group) %>%
  count(compound_val) %>%
  pivot_wider(values_from = n, names_from = compound_val) %>%
  ungroup() %>%
  select(-ENST_group)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("match", "no_match")
cont_table

# plot data
mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

# if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected
## fail pass
## match 105.07727 696.92273
## no_match 11.92273 79.07727

# chisquare test
chisq.test(cont_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 185.78, df = 1, p-value < 2.2e-16

# OR: here row 1 ("match") is the numerator, reversed relative to the
# earlier tables where row 2 was the group of interest
OR = (cont_table[1,2]/cont_table[1,1])/(cont_table[2,2]/cont_table[2,1])
OR## [1] 17.11969
# ── Detection approach (candidate- vs segmented read-based) vs validation ──
cont_table = cq %>%
  filter(count_group_median == "count ≥ 5") %>%
  # add the per-tool approach annotation
  left_join(read_tsv('../data/details/tool_annotation.txt')) %>%
  select(circ_id, cell_line, compound_val, approach) %>%
  # integrative tools are excluded from this two-group comparison
  filter(!approach == 'integrative') %>%
  unique() %>%
  # filter out NAs
  filter(!is.na(compound_val)) %>%
  group_by(approach) %>%
  count(compound_val) %>%
  pivot_wider(values_from = n, names_from = compound_val) %>%
  ungroup() %>%
  select(-approach)
## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = "tool"
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("candidate-based", "segmented read-based")
cont_table

# plot data
mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

# if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected
## fail pass
## candidate-based 26.26401 158.736
## segmented read-based 87.73599 530.264

# chisquare test
chisq.test(cont_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 9.3937, df = 1, p-value = 0.002177

# OR: odds ratio with candidate-based (row 1) as the numerator
OR = (cont_table[1,2]/cont_table[1,1])/(cont_table[2,2]/cont_table[2,1])
OR## [1] 2.584734
# ── BSJ count group (low vs high abundance) vs validation status ──────────
cont_table = cq %>%
  select(circ_id, cell_line, compound_val, count_group_median) %>%
  unique() %>%
  # filter out NAs
  filter(!is.na(compound_val)) %>%
  group_by(count_group_median) %>%
  count(compound_val) %>%
  pivot_wider(values_from = n, names_from = compound_val) %>%
  ungroup() %>%
  select(-count_group_median)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("count < 5", "count ≥ 5")
cont_table

# plot data
mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

# if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected
## fail pass
## count < 5 54.04071 232.9593
## count ≥ 5 167.95929 724.0407

# chisquare test
chisq.test(cont_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 76.721, df = 1, p-value < 2.2e-16

# OR: odds ratio of validating for high-count vs low-count circRNAs
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 3.821499
# Per-tool annotation (approach, lin_annotation, strand_anno, splicing, ...).
tool_anno = read_tsv('../data/details/tool_annotation.txt')## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-tool sensitivity values (Supplementary Table 6B).
sens = read_tsv('../data/Supplementary_Table_6B_sensitivity_values.txt')## Rows: 32 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group_median
## dbl (3): nr_detected, nr_expected, sensitivity
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# ── Sensitivity vs tool characteristics ───────────────────────────────────
# add annotation to sensitivity
sens_anno = sens %>%
  left_join(tool_anno) %>%
  select(-tool_lt) %>% ungroup() %>%
  filter(count_group_median == 'count ≥ 5')## Joining, by = "tool"
sens_anno

# overview: number of pass/fail circRNAs across cell lines
cq %>% select(circ_id, compound_val, cell_line) %>% unique() %>% count(compound_val)

# sensitivity by detection approach (integrative tools excluded)
wilcox.test(sensitivity ~ approach, data=sens_anno %>% filter(!approach == 'integrative'))
##
## Wilcoxon rank sum exact test
##
## data: sensitivity by approach
## W = 6, p-value = 0.1264
## alternative hypothesis: true location shift is not equal to 0

# sensitivity across all three approaches
kruskal.test(sensitivity ~ approach, data=sens_anno)
##
## Kruskal-Wallis rank sum test
##
## data: sensitivity by approach
## Kruskal-Wallis chi-squared = 3.3122, df = 2, p-value = 0.1909

# sensitivity by linear-annotation usage
wilcox.test(sensitivity ~ lin_annotation, data=sens_anno)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: sensitivity by lin_annotation
## W = 24, p-value = 0.5902
## alternative hypothesis: true location shift is not equal to 0

# sensitivity by strand-annotation strategy
kruskal.test(sensitivity ~ strand_anno, data=sens_anno %>%
               filter(!strand_anno == "no strand reported"))
##
## Kruskal-Wallis rank sum test
##
## data: sensitivity by strand_anno
## Kruskal-Wallis chi-squared = 3.9841, df = 3, p-value = 0.2632

# sensitivity by splicing category (canonical vs non-canonical)
wilcox_ss = wilcox.test(sensitivity ~ splicing, data=sens_anno)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
wilcox_ss
##
## Wilcoxon rank sum test with continuity correction
##
## data: sensitivity by splicing
## W = 55, p-value = 0.002206
## alternative hypothesis: true location shift is not equal to 0

# mean sensitivity per splicing category
sens_anno %>%
  group_by(splicing) %>%
  summarise(mean_sens = mean(sensitivity))

# effect size r = |Z| / sqrt(N), with Z recovered from the two-sided p-value
N = nrow(sens_anno)
N## [1] 16
Z = qnorm(wilcox_ss$p.value/2)
Z## [1] -3.061034
r = abs(Z)/sqrt(N)
r## [1] 0.7652585

library(rstatix)
library(ggpubr)
# cross-check the Wilcoxon test and effect size with rstatix
sens_anno %>% rstatix::wilcox_test(sensitivity ~ splicing)
sens_anno %>% rstatix::wilcox_effsize(sensitivity ~ splicing)
#sens_anno %>% rstatix::wilcox_effsize(sensitivity ~ splicing, ci = T)

# All pairwise sensitivity differences between canonical and non-canonical
# splicing tools (full cross of the two tool sets).
# NOTE(review): grows a tibble inside a nested loop; acceptable here since
# there are only 16 tools in total.
median_diff = tibble()
for (tool_can in sens_anno %>% filter(splicing == "canonical") %>% pull(tool)) {
  sens_can = sens_anno %>% filter(tool == tool_can) %>% pull(sensitivity)
  for (tool_non_can in sens_anno %>% filter(splicing == "non-canonical") %>% pull(tool)) {
    sens_non_can = sens_anno %>% filter(tool == tool_non_can) %>% pull(sensitivity)
    median_diff = median_diff %>%
      bind_rows(tibble(tool_can, sens_can, tool_non_can, sens_non_can))
  }
}
median_diff = median_diff %>%
  mutate(sens_diff = sens_can - sens_non_can)
median_diff

# median of all pairwise differences
median_diff %>% pull(sens_diff) %>% median()
## [1] 0.3845161
# Sensitivity by BSJ filtering strategy — no evidence of a difference.
wilcox.test(sensitivity ~ BSJ_filter, data=sens_anno) ## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: sensitivity by BSJ_filter
## W = 28, p-value = 1
## alternative hypothesis: true location shift is not equal to 0
# ── Correlation between sensitivity and the theoretical number of true
#    positives implied by each tool's precision ────────────────────────────
val_cor = val %>%
  # use perc_compound_val
  select(tool, count_group, perc_compound_val) %>%
  # get the number of detected circRNAs for each cell line and tool, per count group
  left_join(all_circ %>%
              group_by(cell_line, tool, count_group) %>%
              summarise(total_n_ut = n())) %>%
  # calculate the theoretical nr of validated circ
  mutate(theo_TP_all = perc_compound_val * total_n_ut) %>%
  left_join(sens, by = c('count_group' = 'count_group_median', 'tool')) %>%
  group_by(tool, count_group, perc_compound_val, sensitivity) %>%
  summarize(theo_TP_all_cl = sum(theo_TP_all)) %>% ungroup()
## `summarise()` has grouped output by 'cell_line', 'tool'. You can override using
## the `.groups` argument.
## Joining, by = c("tool", "count_group")
## `summarise()` has grouped output by 'tool', 'count_group', 'perc_compound_val'.
## You can override using the `.groups` argument.
val_cor

# below 5
val_cor_tmp = val_cor %>% filter(count_group == "count < 5")
cor.test(val_cor_tmp$sensitivity, val_cor_tmp$theo_TP_all_cl, method = 'spearman')
## Warning in cor.test.default(val_cor_tmp$sensitivity,
## val_cor_tmp$theo_TP_all_cl, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: val_cor_tmp$sensitivity and val_cor_tmp$theo_TP_all_cl
## S = 58.159, p-value = 0.0003236
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8402236

# above 5
val_cor_tmp = val_cor %>% filter(count_group == "count ≥ 5")
cor.test(val_cor_tmp$sensitivity, val_cor_tmp$theo_TP_all_cl, method = 'spearman')
## Warning in cor.test.default(val_cor_tmp$sensitivity,
## val_cor_tmp$theo_TP_all_cl, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: val_cor_tmp$sensitivity and val_cor_tmp$theo_TP_all_cl
## S = 112.6, p-value = 0.0003534
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7989279
# ── Validation status per cell line ────────────────────────────────────────
# make count table
cont_table = cq %>%
  select(circ_id, cell_line, compound_val, count_group_median) %>%
  unique() %>%
  # only keep high abundant circRNAs
  filter(count_group_median == 'count ≥ 5') %>%
  # to use all val together
  filter(!is.na(compound_val)) %>%
  group_by(cell_line) %>%
  count(compound_val) %>%
  pivot_wider(values_from = n, names_from = compound_val) %>%
  ungroup() %>%
  select(-cell_line)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("HLF", "NCI-H23", "SW480")
cont_table

# plot data
mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

# if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected
## fail pass
## HLF 46.56390 308.4361
## NCI-H23 37.51345 248.4865
## SW480 32.92265 218.0774

# chisquare test (3x2 table, so no Yates correction is applied)
chisq.test(cont_table)
##
## Pearson's Chi-squared test
##
## data: cont_table
## X-squared = 24.997, df = 2, p-value = 3.732e-06
# ── Per-cell-line sensitivity and two-way ANOVA (cell line + tool) ─────────
# first calculate the total nr of validated circ per count group
nr_val_cl = cq %>%
  # get set of uniquely validated circRNAs
  filter(compound_val == 'pass') %>%
  select(circ_id, cell_line, count_group_median) %>% unique() %>%
  group_by(count_group_median, cell_line) %>%
  summarise(nr_expected = n())
## `summarise()` has grouped output by 'count_group_median'. You can override
## using the `.groups` argument.
nr_val_cl

# then calculate sensitivity by dividing nr of circ found by total
sens_cl = cq %>%
  # get set of uniquely validated circRNAs
  filter(compound_val == 'pass') %>%
  select(circ_id, cell_line, count_group_median) %>% unique() %>%
  # check witch tools have detected these
  left_join(all_circ %>%
              select(tool, circ_id, cell_line) %>% unique()) %>%
  group_by(tool, count_group_median, cell_line) %>%
  summarise(nr_detected = n()) %>%
  left_join(nr_val_cl) %>%
  mutate(sensitivity = nr_detected/nr_expected) %>%
  ungroup()
## Joining, by = c("circ_id", "cell_line")
## `summarise()` has grouped output by 'tool', 'count_group_median'. You can
## override using the `.groups` argument.
## Joining, by = c("count_group_median", "cell_line")

# then ANOVA
sens_cl_anova = sens_cl %>%
  filter(count_group_median == 'count ≥ 5') %>%
  select(tool, cell_line, sensitivity)
sens_cl_anova
sens_cl_anova$cell_line = factor(sens_cl_anova$cell_line, levels = c('HLF', 'NCI-H23', 'SW480'))
sens_cl_anova$tool = factor(sens_cl_anova$tool, levels = c("CIRCexplorer3", "CirComPara2", "circRNA_finder", "circseq_cup", "CircSplice", "circtools","CIRI2", "CIRIquant", "ecircscreen","find_circ", "KNIFE", "NCLcomparator", "NCLscan", "PFv2","Sailfish-cir", "segemehl"))
res.aov = aov(sensitivity ~ cell_line+tool, data = sens_cl_anova)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## cell_line 2 0.0036 0.00181 1.274 0.295
## tool 15 1.8085 0.12057 84.665 <2e-16 ***
## Residuals 30 0.0427 0.00142
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# fraction of total sum of squares explained by cell line, then by tool
0.0036/(0.0036+1.8085+0.0427)## [1] 0.00194091
1.8085/(0.0036+1.8085+0.0427)## [1] 0.9750377
# ── Per-cell-line precision and two-way ANOVAs (cell line + tool) ──────────
# recalculate the precision per cell line
val_cl = cq %>%
  select(tool, circ_id, cell_line, count_group, qPCR_val, RR_val, RR_val_detail, amp_seq_val, amp_seq_val_detail, compound_val) %>%
  group_by(tool, count_group, cell_line) %>%
  summarise(nr_qPCR_total = n(),
            nr_qPCR_fail = sum(qPCR_val == 'fail'),
            nr_qPCR_val = sum(qPCR_val == 'pass'),
            nr_RR_total = n() - sum(is.na(RR_val)), # here NA are the ones that have are 'out_of_range'
            nr_RR_fail = sum(RR_val == "fail", na.rm = T),
            nr_RR_val = sum(RR_val == "pass", na.rm = T),
            nr_amp_total = n() - sum(is.na(amp_seq_val)), # here NA are the ones 'not_included'
            nr_amp_fail = sum(amp_seq_val == "fail", na.rm = T),
            nr_amp_val = sum(amp_seq_val == "pass", na.rm = T),
            nr_compound_total = n() - sum(is.na(amp_seq_val)), # here NA are the ones 'not_included'
            nr_compound_fail = sum(compound_val == "fail", na.rm = T),
            nr_compound_val = sum(compound_val == "pass", na.rm = T)) %>%
  mutate(perc_qPCR_val = nr_qPCR_val/nr_qPCR_total,
         perc_RR_val = nr_RR_val/nr_RR_total,
         perc_amp_val = nr_amp_val/nr_amp_total,
         perc_compound_val = nr_compound_val/nr_compound_total) %>%
  ungroup()
## `summarise()` has grouped output by 'tool', 'count_group'. You can override
## using the `.groups` argument.

# ANOVA
val_cl_anova = val_cl %>%
  filter(count_group == 'count ≥ 5') %>%
  select(tool, cell_line, perc_qPCR_val, perc_RR_val, perc_amp_val, perc_compound_val)
val_cl_anova
val_cl_anova$cell_line = factor(val_cl_anova$cell_line, levels = c('HLF', 'NCI-H23', 'SW480'))
val_cl_anova$tool = factor(val_cl_anova$tool, levels = c("CIRCexplorer3", "CirComPara2", "circRNA_finder", "circseq_cup", "CircSplice", "circtools","CIRI2", "CIRIquant", "ecircscreen","find_circ", "KNIFE", "NCLcomparator", "NCLscan", "PFv2","Sailfish-cir", "segemehl"))

# qPCR-based precision
res.aov = aov(perc_qPCR_val ~ cell_line+tool, data = val_cl_anova)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## cell_line 2 0.008126 0.004063 6.586 0.00453 **
## tool 14 0.024932 0.001781 2.887 0.00821 **
## Residuals 28 0.017273 0.000617
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# fraction of total sum of squares explained by cell line, then by tool
0.008126/(0.008126+0.024932+0.017273)## [1] 0.1614512
0.024932/(0.008126+0.024932+0.017273)## [1] 0.4953607

# RNase R-based precision
res.aov = aov(perc_RR_val ~ cell_line+tool, data = val_cl_anova)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## cell_line 2 0.05260 0.026300 8.642 0.00119 **
## tool 14 0.16010 0.011436 3.758 0.00140 **
## Residuals 28 0.08521 0.003043
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): these ratios use the Mean Sq column, unlike the other three
# blocks which use Sum Sq — confirm which was intended.
0.026300/(0.026300+0.011436+0.003043)## [1] 0.6449398
0.011436/(0.026300+0.011436+0.003043)## [1] 0.2804385

# amplicon-sequencing-based precision
res.aov = aov(perc_amp_val ~ cell_line+tool, data = val_cl_anova)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## cell_line 2 0.0182 0.00912 3.269 0.053 .
## tool 14 1.2605 0.09004 32.281 1.47e-13 ***
## Residuals 28 0.0781 0.00279
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
0.00912/(0.00912+0.09004+0.00279)## [1] 0.08945562
0.09004/(0.00912+0.09004+0.00279)## [1] 0.883178

# compound precision
res.aov = aov(perc_compound_val ~ cell_line+tool, data = val_cl_anova)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## cell_line 2 0.1241 0.06205 17.31 1.28e-05 ***
## tool 14 1.3168 0.09406 26.24 2.06e-12 ***
## Residuals 28 0.1004 0.00359
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
0.06205/(0.06205+0.09406+0.00359)## [1] 0.388541
0.09406/(0.06205+0.09406+0.00359)## [1] 0.5889793
# ── RNase R enrichment vs count group ──────────────────────────────────────
# RNase R treated vs untreated sequencing results (Supplementary Table 5).
treatment = read_tsv('../data/Supplementary_Table_5_RNase_R_enrichment_seq.txt')## Rows: 2303689 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group,...
## dbl (9): start, end, count_UT, count_T, nr_reads_UT, nr_reads_T, cpm_T, cpm_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

cont_table = treatment %>%
  # remove Sailfish-cir as it does not report counts
  filter(!tool == 'Sailfish-cir') %>%
  # remove NAs
  filter(!is.na(count_UT), !is.na(count_T)) %>%
  # add median count group
  left_join(all_circ %>%
              select(circ_id_strand, count_group_median, cell_line) %>% unique()) %>%
  select(circ_id_strand, cell_line, tool, enrichment_bin, count_group_median) %>%
  group_by(count_group_median) %>%
  count(enrichment_bin) %>%
  pivot_wider(values_from = n, names_from = enrichment_bin) %>%
  ungroup() %>%
  select(-count_group_median)
## Joining, by = c("cell_line", "circ_id_strand")
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("count < 5", "count ≥ 5")
cont_table

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

# if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected
## enriched not enriched
## count < 5 307817 54224.03
## count ≥ 5 114801 20222.97
chisq.test(cont_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 1067, df = 1, p-value < 2.2e-16

# OR: odds ratio of not being enriched for low-count vs high-count circRNAs
OR = (cont_table[1,2]/cont_table[1,1])/(cont_table[2,2]/cont_table[2,1])
OR## [1] 1.360636